## Replicate Text and Table Statistics
## Andrew Reeves and Ryan T. Moore
## First: 9 February 2017
## Last: 19 January 2020

## Load libraries:
library(lubridate)
library(Hmisc)
library(plyr)
library(maps)

## Load required data:
load("figure6andInPaperStats.Rdata")

## Our sample of OpenPaths users includes 2.6 million data points from 446 individuals. Note that this is a large file so appropriate memory is required.
length(unique(opLocs$uid))
# [1] 446
nrow(opLocs)
# [1] 2582492

## The number of GPS points for each individual range from 1 to over 111,000
sort(table(opLocs$uid))
# a9d6534e77d9821e36a8c54018ca6458 35a58c5e8ad13653abfb2d361c697b5b ae5947b189629d83124b3892cb825622 
# 1                                2                                2 
# f75ddae405bf893995e26f1915000b56 650c2aad88c1ab665ba20e061e7b6e4e 6fcf8ac99594005c720b6ec3da17c662 
# 2                                3                                3 
# ...
# 1ca991ffa462851212dc5bc560cc8bd5 26a1206d81d73cae47f19f0c790a7a44 22de352b3af53fad2acce888b189c964 
# 39100                            42428                            44052 
# 2735e1baaecc28928c7d2f549982cb42 ad165278adca191026ca006d41ce85a4 
# 50919                           111618 

## with the median number of coordinate pairs being about 3,200.
median(table(opLocs$uid))
# [1] 3182.5

## On average, we have about a year’s worth of geolocation for the individuals (364.4 days) 
##  with a maximum of over four years’ worth of data. 
opLocs$time <- substr(opLocs$date, start = 12, stop = 20)
opLocs$date <- ymd_hms(opLocs$date, tz = "GMT")
opLocs <- subset(opLocs, year(opLocs$date)!=1970)
opLocsMin <- aggregate(opLocs$date, by = list(uid = opLocs$uid), min)
opLocsMax <- aggregate(opLocs$date, by = list(uid = opLocs$uid), max)
opLocsMinMax <- cbind(opLocsMin, opLocsMax$x)
names(opLocsMinMax) <- c("uid", "timeMin", "timeMax")
rm(opLocsMax, opLocsMin)
opLocsMinMax$timeLength <- opLocsMinMax$timeMax - opLocsMinMax$timeMin
opLocsMinMax$dayLength <- as.numeric(difftime(opLocsMinMax$timeMax, opLocsMinMax$timeMin, units=c("days")))
mean(opLocsMinMax$dayLength)
# [1] 364.3989
max(opLocsMinMax$dayLength)
# [1] 1471.068
max(opLocsMinMax$dayLength)/365
# [1] 4.030323

## For just over 1% of our respondents, we have less than a day’s worth of geolocations.
length(opLocsMinMax$uid[opLocsMinMax$dayLength < 1]) / nrow(opLocsMinMax)
# [1] 0.01345291
rm(opLocsMinMax)

## We record coordinates in all fifty states and the District of Columbia. 
table(opLocs$statefp10.county) ## number of points by state fips code (see https://www.census.gov/geo/reference/ansi_statetables.html)
# 01     02     04     05     06     08     09     10     11     12     13     15     16     17     18     19 
# 15126   4343   7108  11812 569120  80482  16186   2946  47284  68574  26774   7299   4998 120822  23229   1229 
# 20     21     22     23     24     25     26     27     28     29     30     31     32     33     34     35 
# 1688   3943  11731   6043  29676 121098  66312  67994   1960  50416   8902    899   5556  21980  97515   2346 
# 36     37     38     39     40     41     42     44     45     46     47     48     49     50     51     53 
# 312125  98035    203  40726   1598  52828 174693   7466   7921    974  59632  90409  29033   5581  44163 110875 
# 54     55     56     72 
# 1448  37679   1446      0 


## Even sparsely-represented states include a few hundred observed points 
##  (North Dakota: 203; Nebraska: 899; South Dakota: 974), 
nrow(opLocs[opLocs$statefp10.county == 38,]) # 38 is ND FIPS code
# [1] 203
nrow(opLocs[opLocs$statefp10.county == 31,]) # 31 is NE FIPS code
# [1] 899
nrow(opLocs[opLocs$statefp10.county == 46,]) # 46 is SD FIPS code
# [1] 974


## and the more commonly-observed states include hundreds of thousands of points 
##  (Pennsylvania: 6% of the total, or 174,693; New York: 12% or 312,129; California: 22% or 569,175). 
nrow(opLocs[opLocs$statefp10.county == "42",]) # 42 is PA FIPS code
# [1] 174693
nrow(opLocs[opLocs$statefp10.county == "42",])/nrow(opLocs)
# [1] 0.0676521
nrow(opLocs[opLocs$statefp10.county == "36",]) # 36 is NY FIPS code
# [1] 312125
nrow(opLocs[opLocs$statefp10.county == "36",])/nrow(opLocs)
# [1] 0.1208744
nrow(opLocs[opLocs$statefp10.county == "06",]) # 06 is CA FIPS code
# [1] 569120
nrow(opLocs[opLocs$statefp10.county == "06",])/nrow(opLocs)
# [1] 0.220399


## Appendix Figure 6
data(state.fips)
freqTable <- data.frame(fips = as.numeric(as.character(names(table(opLocs$statefp10.county)))), 
                        npoints = tabulate(opLocs$statefp10.county)[1:52])
state.fips <- state.fips[, c("fips", "abb")]
state.fips <- unique(state.fips)
freqTable <- merge(freqTable, state.fips, all.x = TRUE, by = "fips")
freqTable$abb <- as.character(freqTable$abb)
freqTable$abb[freqTable$fips == 2] <- "AK" 
freqTable$abb[freqTable$fips == 15] <- "HI" 
freqTable <- freqTable[order(-freqTable$npoints),]
freqTable <- subset(freqTable, !is.na(freqTable$abb))
freqTable = transform(freqTable, abb=reorder(abb, npoints))

pdf(file = "Figure-6.pdf")
ggplot(freqTable)+ geom_point(aes(x = abb, y = log(npoints))) +
  coord_flip() + xlab("State") + ylab("Number of points (logged)") +
  theme( # remove the vertical grid lines
    panel.grid.major.x = element_blank() ,
    # explicitly set the horizontal lines (or they will disappear too)
    panel.grid.major.y = element_line(linetype = 3, color = "darkgray"),
    axis.text.y = element_text(size = rel(0.8)) )  
dev.off()

